clean__harmonize_hs_names<-function(years){
  #This function attempts to "harmonize" HS names in different years.
  #We populate a high-school list with the HS's of the latest year (e.g. 2019).
  #Then, for each year, we try to match the HS names to the names in our Hs list.
  #The match happens in a few stages.
  # First, we take a look at town names. If the town a certain school is located in does not appear 
  # in the list of towns in our HS list, then we know we will not find a match; we add these HS directly to the list
  #(hs_list_temp_unmatched). For HS's that do have towns that are already in the list, 2 stages:
  #1. hs_list_temp_matched_perfect: we match names that are exact matches
  #2. hs_list_temp_matched: we use fuzzy matching to match the rest
  #The fuzzy matching is a bit complicated. We first do a very "loose" left join of new HS's on our old HS list.
  #The match returns many potential matches for each new HS. Then, we start filtering through these matches based on:
  #string distance of the full name, string distance of the names found between double quotes, string distance of names minus "trivial" words
  # such as "college", "HS", "technical", etc. We keep the best matches in terms of distance, but also look at whether
  # a HS is the closest match to an existing HS in our list and vice versa. We impose an arbitrary cutoff on the distance required to
  # accept a match. This cutoff depends on the number of HS's in the town. If the town has few HS's, we allow a
  # large distance between names. If the town has many HS's, we are more selective.
  i<-years[1]
  print(i)
  hs_list<-data.frame(judet_bac=character(),town_hs_bac=character(),unitate_de_invatamant=character(),liceu_repartizat_corresponding_to_unitate_de_invatamant=character())
  

  
  #create the initial HS list from the first year in `years` (e.g. 2019)
  setwd(wd_data_intermediate)
  data_bac_raw<-readRDS(paste("data_merged_complete_town_",i,sep=""))
  message(paste('Observations before:',nrow(data_bac_raw)))
 
  #%>% filter(grepl('MARGHITA',unitate_de_invatamant))

  
  hs_list<-unique(data_bac_raw[,c("judet_bac","town_hs_bac","unitate_de_invatamant","liceu_repartizat_corresponding_to_unitate_de_invatamant")])
  hs_list$school_harmonized<-hs_list$unitate_de_invatamant
  hs_list$yr<-i
  hs_list<-hs_list %>% arrange(judet_bac,town_hs_bac)
  hs_list$dist<-0
 
  
  #hs_list will be the full list of hs and their respective matches;
  #the deduplicated list will (hopefully) show each hs only once; we will use this for matching new (years) of hs's to existing ones
  hs_list_deduplicated<-hs_list
  
  towns<-na.omit(unique(hs_list[,c("judet_bac","town_hs_bac")]))

  
  
for (i in years[2:length(years)]){
    print(i)
     setwd(wd_data_intermediate)
    graduation_file_final<-paste("data_merged_complete_town_",i,sep="")
    data_bac_raw<-readRDS(graduation_file_final)
    message(paste('Observations before:',nrow(data_bac_raw)))
    #%>% filter(grepl('MARGHITA',unitate_de_invatamant))
    data_bac_raw<-data_bac_raw[!is.na(data_bac_raw$judet_bac),]
    
    hs_list_temp<-unique(data_bac_raw[,c("judet_bac","town_hs_bac","unitate_de_invatamant","liceu_repartizat_corresponding_to_unitate_de_invatamant")])
    hs_list_temp$yr<-i
    

    #towns for matching; only use towns that are already in the list in previous years; otherwise, no match
    #is possible, so we will add those to the list later
    towns_temp<-na.omit(unique(hs_list_temp[,c("judet_bac","town_hs_bac")]))
    
    towns_temp_matched<-semi_join(towns_temp[,c("judet_bac","town_hs_bac")],towns[,c("judet_bac","town_hs_bac")],by=c("judet_bac","town_hs_bac"))
    
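    #town by town, find exact matches first (names are compared after stripping punctuation;
    #note that pmatch also accepts a unique partial (prefix) match when no exact match exists)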
    
    list<-lapply(1:dim(towns_temp_matched)[1], function(x) {
      hs_list_new<-hs_list_temp[hs_list_temp$judet_bac==towns_temp_matched$judet_bac[x] & hs_list_temp$town_hs_bac==towns_temp_matched$town_hs_bac[x],]
      hs_list_old<-hs_list[hs_list$judet_bac==towns_temp_matched$judet_bac[x] & hs_list$town_hs_bac==towns_temp_matched$town_hs_bac[x],]
      
      index<-pmatch(gsub('[[:punct:]]+','',hs_list_new$unitate_de_invatamant),
                    gsub('[[:punct:]]+','',hs_list_old$school_harmonized),
                    dup=FALSE)
      
      hs_list_new$school_harmonized<-hs_list_old$school_harmonized[index]
      hs_list_new$liceu_repartizat_from_hs_list<-hs_list_old$liceu_repartizat_corresponding_to_unitate_de_invatamant[index]
      hs_list_new$dist<-0
      hs_list_old_unmatched<-hs_list_old[setdiff((1:dim(hs_list_old)[1]),index),]
      
      list<-list(hs_list_new,hs_list_old_unmatched)
      return(list)
    }
    )
    hs_list_temp_matched_perfect<-as.data.frame(do.call("rbind",lapply(1:length(list), function(x) list[[x]][[1]])))
    hs_list_temp_matched_perfect<-hs_list_temp_matched_perfect[!is.na(hs_list_temp_matched_perfect$judet_bac),]
    hs_list_temp_matched_perfect$dist<--1
    

    hs_list_old_unmatched<-as.data.frame(do.call("rbind",lapply(1:length(list), function(x) list[[x]][[2]])))
    hs_list_old_unmatched<-hs_list_old_unmatched[!is.na(hs_list_old_unmatched$judet_bac),]
    
    
    #town by town, find possible matching hs's for the ones without direct match
    #find towns with unmatched schools (both in the new year and the years already analyzed)
    towns_temp_unmatched<-as.data.frame(hs_list_temp_matched_perfect %>% filter(is.na(school_harmonized)) %>% group_by(judet_bac,town_hs_bac) %>% summarize())
    towns_old_unmatched<-as.data.frame(hs_list_old_unmatched %>% group_by(judet_bac,town_hs_bac) %>% summarize())
    
    #towns with unmatched schools in both lists will be matched using fuzzy matching
    
    towns_temp_unmatched<-semi_join(towns_temp_unmatched[,c("judet_bac","town_hs_bac")],towns_old_unmatched[,c("judet_bac","town_hs_bac")],
                                    by=c("judet_bac","town_hs_bac"))
    
    hs_list_temp_matched<-lapply(1:dim(towns_temp_unmatched)[1], function(x) {
      return<-stringdist_left_join(hs_list_temp_matched_perfect[hs_list_temp_matched_perfect$judet_bac==towns_temp_unmatched$judet_bac[x] & hs_list_temp_matched_perfect$town_hs_bac==towns_temp_unmatched$town_hs_bac[x] & is.na(hs_list_temp_matched_perfect$school_harmonized),!colnames(hs_list_temp_matched_perfect) %in% c('liceu_repartizat_from_hs_list')],
                                   hs_list_old_unmatched[hs_list_old_unmatched$judet_bac==towns_temp_unmatched$judet_bac[x] & hs_list_old_unmatched$town_hs_bac==towns_temp_unmatched$town_hs_bac[x],!colnames(hs_list_old_unmatched) %in% c("dist")],
                                   by=c(unitate_de_invatamant="unitate_de_invatamant"),max_dist=40,distance_col="dist.y")
      if(dim(return)[2]<14 & dim(return)[1]>0){
        return$dist.y<-999
      }
      return(return)
    }
    )
    
    hs_list_temp_matched_perfect<-hs_list_temp_matched_perfect[!is.na(hs_list_temp_matched_perfect$school_harmonized),]
    hs_list_temp_matched_perfect$school_harmonized_original<-  hs_list_temp_matched_perfect$school_harmonized
    #clean the column names, drop useless columns, etc.
    
    
    
    hs_list_temp_matched<-do.call("rbind",hs_list_temp_matched)
    hs_list_temp_matched$dist<-hs_list_temp_matched$dist.y
    hs_list_temp_matched$school_harmonized.x<-as.character(hs_list_temp_matched$unitate_de_invatamant.y)
    hs_list_temp_matched$school_harmonized_original.x<-as.character(hs_list_temp_matched$school_harmonized.y)
    if(dim(hs_list_temp_matched[is.na(hs_list_temp_matched$dist),])[1]>0) {
      hs_list_temp_matched[is.na(hs_list_temp_matched$dist),]$dist<--2
      hs_list_temp_matched[hs_list_temp_matched$dist==-2,]$school_harmonized_original.x<-hs_list_temp_matched[hs_list_temp_matched$dist==-2,]$unitate_de_invatamant.x
      hs_list_temp_matched[hs_list_temp_matched$dist==-2,]$school_harmonized.x<-hs_list_temp_matched[hs_list_temp_matched$dist==-2,]$unitate_de_invatamant.x
      
    } 
    colnames(hs_list_temp_matched)[colnames(hs_list_temp_matched)=='liceu_repartizat_corresponding_to_unitate_de_invatamant.y']<-'liceu_repartizat_from_hs_list'
    hs_list_temp_matched<-hs_list_temp_matched[, -grep("\\.y", colnames(hs_list_temp_matched))]
    colnames(hs_list_temp_matched)<-gsub(names(hs_list_temp_matched), pattern = "\\.x", replacement = "") 
    
    #####
    
    ######
    hs_list_temp_matched<-rbind(hs_list_temp_matched,hs_list_temp_matched_perfect)
    
    
    
    ###################
    ###################
    #school name
    hs_list_temp_matched$unitate_de_invatamant_original<- hs_list_temp_matched$unitate_de_invatamant
    #hs_list_temp_matched$school_harmonized_original<- hs_list_temp_matched$school_harmonized
    
    
    hs_list_temp_matched$dist<-stringdist(hs_list_temp_matched$school_harmonized,hs_list_temp_matched$unitate_de_invatamant)
    
    #Get HS names from " "
    #"unitate de invatamant" name
    hs_list_temp_matched$name_1<-lapply(stringi::stri_extract_all_regex(hs_list_temp_matched$unitate_de_invatamant, '(?<=").*?(?=")'), `[[`, 1)
    hs_list_temp_matched[is.na(hs_list_temp_matched$name_1),]$name_1<-hs_list_temp_matched[is.na(hs_list_temp_matched$name_1),]$unitate_de_invatamant
    string<-c("ORAS ","ORASUL ","MUN\\. ","MUNICIPIUL ","COM\\. ","COMUNA ","SAT ","LOC\\. ","LOCALITATEA")
    hs_list_temp_matched$name_1<-removeWords(unlist(hs_list_temp_matched$name_1), string)
    string<-c("LICEUL ","COLEGIUL NATIONAL ","COLEGIUL ","COLEGIU ","SCOALA ","GRUP SCOLAR ","GRUPUL SCOLAR ","SEMINARUL ","PARTICULAR ","TEHNOLOGIC ","LICEAL ","TEHNIC ","SPECIAL ")
    hs_list_temp_matched$name_1<-removeWords(unlist(hs_list_temp_matched$name_1), string)
    hs_list_temp_matched$name_1<-trimws(str_remove_all(hs_list_temp_matched$name_1, hs_list_temp_matched$town_hs_bac))
    hs_list_temp_matched[hs_list_temp_matched$name_1=="" & !is.na(hs_list_temp_matched$name_1),]$name_1<-str_remove_all(hs_list_temp_matched[hs_list_temp_matched$name_1=="" & !is.na(hs_list_temp_matched$name_1),]$school_harmonized,hs_list_temp_matched[hs_list_temp_matched$name_1=="" & !is.na(hs_list_temp_matched$name_1),]$town_hs_bac)
    
    
    #"school harmonized" name
    #name_2
    hs_list_temp_matched$name_2<-lapply(stringi::stri_extract_all_regex(hs_list_temp_matched$school_harmonized, '(?<=").*?(?=")'), `[[`,1)
    hs_list_temp_matched[is.na(hs_list_temp_matched$name_2),]$name_2<-hs_list_temp_matched[is.na(hs_list_temp_matched$name_2),]$school_harmonized
    string<-c("ORAS ","ORASUL ","MUN\\. ","MUNICIPIUL ","COM\\. ","COMUNA ","SAT ","LOC\\. ","LOCALITATEA")
    hs_list_temp_matched$name_2<-removeWords(unlist(hs_list_temp_matched$name_2), string)
    string<-c("LICEUL ","COLEGIUL NATIONAL ","COLEGIUL ","COLEGIU ","SCOALA ","GRUP SCOLAR ","GRUPUL SCOLAR ","SEMINARUL ","PARTICULAR ","TEHNOLOGIC ","LICEAL ","TEHNIC ","SPECIAL ")
    hs_list_temp_matched$name_2<-removeWords(hs_list_temp_matched$name_2, string)
    hs_list_temp_matched$name_2<-trimws(str_remove_all(hs_list_temp_matched$name_2, hs_list_temp_matched$town_hs_bac))
    hs_list_temp_matched[hs_list_temp_matched$name_2=="" & !is.na(hs_list_temp_matched$name_2),]$name_2<-str_remove_all(hs_list_temp_matched[hs_list_temp_matched$name_2=="" & !is.na(hs_list_temp_matched$name_2),]$school_harmonized,hs_list_temp_matched[hs_list_temp_matched$name_2=="" & !is.na(hs_list_temp_matched$name_2),]$town_hs_bac)
    
    hs_list_temp_matched$dist2<-stringdist(hs_list_temp_matched$name_1,hs_list_temp_matched$name_2)
    
    #"liceu_repartizat" name
    #name_3
    hs_list_temp_matched$name_3<-lapply(stringi::stri_extract_all_regex(hs_list_temp_matched$liceu_repartizat_corresponding_to_unitate_de_invatamant, '(?<=").*?(?=")'), `[[`,1)
    hs_list_temp_matched[is.na(hs_list_temp_matched$name_3),]$name_3<-hs_list_temp_matched[is.na(hs_list_temp_matched$name_3),]$liceu_repartizat_corresponding_to_unitate_de_invatamant
    string<-c("ORAS ","ORASUL ","MUN\\. ","MUNICIPIUL ","COM\\. ","COMUNA ","SAT ","LOC\\. ","LOCALITATEA")
    hs_list_temp_matched$name_3<-removeWords(unlist(hs_list_temp_matched$name_3), string)
    string<-c("LICEUL ","COLEGIUL NATIONAL ","COLEGIUL ","COLEGIU ","SCOALA ","GRUP SCOLAR ","GRUPUL SCOLAR ","SEMINARUL ","PARTICULAR ","TEHNOLOGIC ","LICEAL ","TEHNIC ","SPECIAL ")
    hs_list_temp_matched$name_3<-removeWords(hs_list_temp_matched$name_3, string)
    hs_list_temp_matched$name_3<-trimws(str_remove_all(hs_list_temp_matched$name_3, hs_list_temp_matched$town_hs_bac))
    hs_list_temp_matched[hs_list_temp_matched$name_3=="" & !is.na(hs_list_temp_matched$name_3),]$name_3<-str_remove_all(hs_list_temp_matched[hs_list_temp_matched$name_3=="" & !is.na(hs_list_temp_matched$name_3),]$liceu_repartizat_corresponding_to_unitate_de_invatamant,hs_list_temp_matched[hs_list_temp_matched$name_3=="" & !is.na(hs_list_temp_matched$name_3),]$town_hs_bac)
    
    hs_list_temp_matched$dist3<-stringdist(hs_list_temp_matched$name_1,hs_list_temp_matched$name_3)
    
    
    
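    #make sure schools of different types are not matched to each other: if the two names being compared
    #match different categories (technical, theoretic, theologic, sports, private, special, arts), the distance is set to 100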
 #theoretic<-"(TEORETIC)|(COLEGIUL NATIONAL)|(COLEGIU NATIONAL)|(LICEUL PEDAGOGIC)"
    technical<-"(LICEUL TEHNIC)|(LICEUL TEHNOLOGIC)|(LICEUL INDUSTRIAL)|(COLEGIUL TEHNIC)|(COLEGIUL AGRICOL)|(COLEGIUL SILVIC)|(LICEUL UCECOM)|(SCOALA PROFESIONALA)|(LICEUL ECONOMIC)|(GRUP SCOLAR)|(GRUPUL SCOLAR)|(GR\\. SC\\.)"
    theoretic<-"(TEORETIC)|(LICEUL PEDAGOGIC)"
    theologic<-"(TEOLOGIC)|(SEMINAR)|(ORTODOX)|(REFORMAT)|(CATOLIC)|(GRECO)|(CRESTIN)|(PENTICOSTAL)|(ADEVNTIST)|(BAPTIST)"
    sports<-"(SPORTIV)|(ATLETISM)"
    private<-"(PARTICULAR)|(PRIVAT)"
    special<-"(SPECIAL)|(INCLUZIV)"
    arts<-"( DANS)|(COREGRAFIE)|(MUZICA)|(DE ARTA)|(DE ARTE)|(PLASTICE)|(PLASTICA)|(ARTISTIC)"
    
    hs_list_temp_matched$dist<-ifelse((grepl(paste(technical,theoretic,theologic,sports,private,special,sep="|"),hs_list_temp_matched$unitate_de_invatamant)& grepl(arts,hs_list_temp_matched$school_harmonized))|
                                        (grepl(arts,hs_list_temp_matched$unitate_de_invatamant)&grepl(paste(technical,theoretic,theologic,sports,private,special,sep="|"),hs_list_temp_matched$school_harmonized)),
                                      100,hs_list_temp_matched$dist)                                                           
    hs_list_temp_matched$dist<-ifelse((grepl(paste(technical,theoretic,theologic,private,special,arts,sep="|"),hs_list_temp_matched$unitate_de_invatamant)& grepl(sports,hs_list_temp_matched$school_harmonized))|
                                        (grepl(sports,hs_list_temp_matched$unitate_de_invatamant)&grepl(paste(technical,theoretic,theologic,private,special,arts,sep="|"),hs_list_temp_matched$school_harmonized)),
                                      100,hs_list_temp_matched$dist) 
    hs_list_temp_matched$dist<-ifelse((grepl(paste(technical,theoretic,sports,private,special,arts,sep="|"),hs_list_temp_matched$unitate_de_invatamant)& grepl(theologic,hs_list_temp_matched$school_harmonized))|
                                        (grepl(theologic,hs_list_temp_matched$unitate_de_invatamant)&grepl(paste(technical,theoretic,sports,private,special,arts,sep="|"),hs_list_temp_matched$school_harmonized)),
                                      100,hs_list_temp_matched$dist)  
    hs_list_temp_matched$dist<-ifelse((grepl(paste(technical,theologic,sports,private,special,arts,sep="|"),hs_list_temp_matched$unitate_de_invatamant)& grepl(theoretic,hs_list_temp_matched$school_harmonized))|
                                        (grepl(theoretic,hs_list_temp_matched$unitate_de_invatamant)&grepl(paste(technical,theologic,sports,private,special,arts,sep="|"),hs_list_temp_matched$school_harmonized)),
                                      100,hs_list_temp_matched$dist)  
    hs_list_temp_matched$dist<-ifelse((grepl(paste(theoretic,theologic,sports,private,special,arts,sep="|"),hs_list_temp_matched$unitate_de_invatamant)& grepl(technical,hs_list_temp_matched$school_harmonized))|
                                        (grepl(technical,hs_list_temp_matched$unitate_de_invatamant)&grepl(paste(theoretic,theologic,sports,private,special,arts,sep="|"),hs_list_temp_matched$school_harmonized)),
                                      100,hs_list_temp_matched$dist) 
    hs_list_temp_matched$dist<-ifelse((grepl(paste(theoretic,theologic,sports,private,technical,arts,sep="|"),hs_list_temp_matched$unitate_de_invatamant)& grepl(special,hs_list_temp_matched$school_harmonized))|
                                        (grepl(special,hs_list_temp_matched$unitate_de_invatamant)&grepl(paste(theoretic,theologic,sports,private,technical,arts,sep="|"),hs_list_temp_matched$school_harmonized)),
                                      100,hs_list_temp_matched$dist) 
    hs_list_temp_matched$dist<-ifelse((grepl(paste(theoretic,theologic,sports,special,technical,arts,sep="|"),hs_list_temp_matched$unitate_de_invatamant)& grepl(private,hs_list_temp_matched$school_harmonized))|
                                        (grepl(private,hs_list_temp_matched$unitate_de_invatamant)&grepl(paste(theoretic,theologic,sports,special,technical,arts,sep="|"),hs_list_temp_matched$school_harmonized)),
                                      100,hs_list_temp_matched$dist)
    #hs_list_temp_matched$dist<-ifelse((grepl(paste(theoretic,theologic,technical,sports,private,special,"(LICEUL)","(COLEGIUL)",arts,sep="|"),hs_list_temp_matched$unitate_de_invatamant)& grepl(technical_pre_2012,hs_list_temp_matched$school_harmonized))|
    #                                    (grepl(technical_pre_2012,hs_list_temp_matched$unitate_de_invatamant)&grepl(paste(theoretic,technical,theologic,sports,private,special,arts,"(LICEUL)","(COLEGIUL)",sep="|"),hs_list_temp_matched$school_harmonized)),
    #                                  100,hs_list_temp_matched$dist) 
    
    
    hs_list_temp_matched$dist2<-ifelse((grepl(paste(technical,theoretic,theologic,sports,private,special,sep="|"),hs_list_temp_matched$unitate_de_invatamant)& grepl(arts,hs_list_temp_matched$school_harmonized))|
                                         (grepl(arts,hs_list_temp_matched$unitate_de_invatamant)&grepl(paste(technical,theoretic,theologic,sports,private,special,sep="|"),hs_list_temp_matched$school_harmonized)),
                                       100,hs_list_temp_matched$dist2)                                                           
    hs_list_temp_matched$dist2<-ifelse((grepl(paste(technical,theoretic,theologic,private,special,arts,sep="|"),hs_list_temp_matched$unitate_de_invatamant)& grepl(sports,hs_list_temp_matched$school_harmonized))|
                                         (grepl(sports,hs_list_temp_matched$unitate_de_invatamant)&grepl(paste(technical,theoretic,theologic,private,special,arts,sep="|"),hs_list_temp_matched$school_harmonized)),
                                       100,hs_list_temp_matched$dist2) 
    hs_list_temp_matched$dist2<-ifelse((grepl(paste(technical,theoretic,sports,private,special,arts,sep="|"),hs_list_temp_matched$unitate_de_invatamant)& grepl(theologic,hs_list_temp_matched$school_harmonized))|
                                         (grepl(theologic,hs_list_temp_matched$unitate_de_invatamant)&grepl(paste(technical,theoretic,sports,private,special,arts,sep="|"),hs_list_temp_matched$school_harmonized)),
                                       100,hs_list_temp_matched$dist2)  
    hs_list_temp_matched$dist2<-ifelse((grepl(paste(technical,theologic,sports,private,special,arts,sep="|"),hs_list_temp_matched$unitate_de_invatamant)& grepl(theoretic,hs_list_temp_matched$school_harmonized))|
                                         (grepl(theoretic,hs_list_temp_matched$unitate_de_invatamant)&grepl(paste(technical,theologic,sports,private,special,arts,sep="|"),hs_list_temp_matched$school_harmonized)),
                                       100,hs_list_temp_matched$dist2)  
    hs_list_temp_matched$dist2<-ifelse((grepl(paste(theoretic,theologic,sports,private,special,arts,sep="|"),hs_list_temp_matched$unitate_de_invatamant)& grepl(technical,hs_list_temp_matched$school_harmonized))|
                                         (grepl(technical,hs_list_temp_matched$unitate_de_invatamant)&grepl(paste(theoretic,theologic,sports,private,special,arts,sep="|"),hs_list_temp_matched$school_harmonized)),
                                       100,hs_list_temp_matched$dist2) 
    hs_list_temp_matched$dist2<-ifelse((grepl(paste(theoretic,theologic,sports,private,technical,arts,sep="|"),hs_list_temp_matched$unitate_de_invatamant)& grepl(special,hs_list_temp_matched$school_harmonized))|
                                         (grepl(special,hs_list_temp_matched$unitate_de_invatamant)&grepl(paste(theoretic,theologic,sports,private,technical,arts,sep="|"),hs_list_temp_matched$school_harmonized)),
                                       100,hs_list_temp_matched$dist2) 
    hs_list_temp_matched$dist2<-ifelse((grepl(paste(theoretic,theologic,sports,special,technical,arts,sep="|"),hs_list_temp_matched$unitate_de_invatamant)& grepl(private,hs_list_temp_matched$school_harmonized))|
                                         (grepl(private,hs_list_temp_matched$unitate_de_invatamant)&grepl(paste(theoretic,theologic,sports,special,technical,arts,sep="|"),hs_list_temp_matched$school_harmonized)),
                                       100,hs_list_temp_matched$dist2)
    #hs_list_temp_matched$dist2<-ifelse((grepl(paste(theoretic,theologic,technical,sports,private,special,"(LICEUL)","(COLEGIUL)",arts,sep="|"),hs_list_temp_matched$unitate_de_invatamant)& grepl(technical_pre_2012,hs_list_temp_matched$school_harmonized))|
    #                                    (grepl(technical_pre_2012,hs_list_temp_matched$unitate_de_invatamant)&grepl(paste(theoretic,technical,theologic,sports,private,special,arts,"(LICEUL)","(COLEGIUL)",sep="|"),hs_list_temp_matched$school_harmonized)),
    #                                  100,hs_list_temp_matched$dist2) 
    
    
    
    
    
    
    
    
    
    ####################
    ####################
    #2nd type of names:
    #remove some words from name
    words_to_remove<-c("LICEUL", "COLEGIUL", "NATIONAL", "COLEGIU",
                       "SCOALA", "GRUP SCOLAR", "GRUPUL SCOLAR", "MUN\\.", "MUN ",
                       "TEOLOGIC", "ROMANIA", "MUNICIPIUL", "ORAS", "ORASUL",
                       "SAT ", "SATUL", "COM\\.", "COMUNA", "LOC\\.", "LOCALITATEA",
                       "SEMINARUL", "CENTRUL DE STUDII", "TEHNOLOGIC", "TEORETIC",
                       "TEORETIC DE STUDII", "SPECIAL PENTRU", "VOCATIONAL")
    
    #unitate_de_invatamant clean
    hs_list_temp_matched$name_1_clean<-hs_list_temp_matched$unitate_de_invatamant
    hs_list_temp_matched$name_1_clean<-str_remove_all(hs_list_temp_matched$name_1_clean,hs_list_temp_matched$town_hs_bac)
    hs_list_temp_matched$name_1_clean<-gsub('[[:punct:] ]+',' ',removeWords(hs_list_temp_matched$name_1_clean,words_to_remove))
    hs_list_temp_matched$name_1_clean<-ifelse(trimws(hs_list_temp_matched$name_1_clean)==''|is.na(hs_list_temp_matched$name_1_clean),hs_list_temp_matched$name_1,hs_list_temp_matched$name_1_clean)
    
    
    #school_harmonized clean
    hs_list_temp_matched$name_2_clean<-hs_list_temp_matched$school_harmonized
    hs_list_temp_matched$name_2_clean<-str_remove_all(hs_list_temp_matched$name_2_clean,hs_list_temp_matched$town_hs_bac)
    hs_list_temp_matched$name_2_clean<-gsub('[[:punct:] ]+',' ',removeWords(hs_list_temp_matched$name_2_clean,words_to_remove))
    hs_list_temp_matched$name_2_clean<-ifelse(trimws(hs_list_temp_matched$name_2_clean)==''|is.na(hs_list_temp_matched$name_2_clean),hs_list_temp_matched$name_2,hs_list_temp_matched$name_2_clean)
    
    
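    #liceu_repartizat_from_hs_list clean
    #NOTE(review): name_3 is built from liceu_repartizat_corresponding_to_unitate_de_invatamant, while name_3_clean starts from liceu_repartizat_from_hs_list - confirm this is intended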
    hs_list_temp_matched$name_3_clean<-hs_list_temp_matched$liceu_repartizat_from_hs_list
    hs_list_temp_matched$name_3_clean<-str_remove_all(hs_list_temp_matched$name_3_clean,hs_list_temp_matched$town_hs_bac)
    hs_list_temp_matched$name_3_clean<-gsub('[[:punct:] ]+',' ',removeWords(hs_list_temp_matched$name_3_clean,words_to_remove))
    hs_list_temp_matched$name_3_clean<-ifelse(trimws(hs_list_temp_matched$name_3_clean)==''|is.na(hs_list_temp_matched$name_3_clean),hs_list_temp_matched$name_3,hs_list_temp_matched$name_3_clean)
    
    
    
    ############
    #distance
    hs_list_temp_matched$dist3<-stringdist(hs_list_temp_matched$name_1_clean,hs_list_temp_matched$name_2_clean)
    hs_list_temp_matched$dist4<-stringdist(hs_list_temp_matched$name_1_clean,hs_list_temp_matched$name_2)
    hs_list_temp_matched$dist5<-stringdist(hs_list_temp_matched$name_1_clean,hs_list_temp_matched$name_3)
    hs_list_temp_matched$dist6<-stringdist(hs_list_temp_matched$name_1_clean,hs_list_temp_matched$name_3_clean)
    
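    #make sure schools of different types are not matched to each other: if the two names being compared
    #match different categories (technical, theoretic, theologic, sports, private, special, arts), the distance is set to 100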
    technical<-"(LICEUL TEHNIC)|(LICEUL TEHNOLOGIC)|(LICEUL INDUSTRIAL)|(COLEGIUL TEHNIC)|(COLEGIUL AGRICOL)|(COLEGIUL SILVIC)|(LICEUL UCECOM)|(SCOALA PROFESIONALA)|(LICEUL ECONOMIC)|(GRUP SCOLAR)|(GRUPUL SCOLAR)|(GR\\. SC\\.)"
    theoretic<-"(TEORETIC)|(LICEUL PEDAGOGIC)"
    theologic<-"(TEOLOGIC)|(SEMINAR)|(ORTODOX)|(REFORMAT)|(CATOLIC)|(GRECO)|(CRESTIN)|(PENTICOSTAL)|(ADEVNTIST)|(BAPTIST)"
    sports<-"(SPORTIV)|(ATLETISM)"
    private<-"(PARTICULAR)|(PRIVAT)"
    special<-"(SPECIAL)|(INCLUZIV)"
    arts<-"( DANS)|(COREGRAFIE)|(MUZICA)|(DE ARTA)|(DE ARTE)|(PLASTICE)|(PLASTICA)|(ARTISTIC)"
    
    hs_list_temp_matched$dist3<-ifelse((grepl(paste(technical,theoretic,theologic,sports,private,special,sep="|"),hs_list_temp_matched$unitate_de_invatamant)& grepl(arts,hs_list_temp_matched$school_harmonized))|
                                         (grepl(arts,hs_list_temp_matched$unitate_de_invatamant)&grepl(paste(technical,theoretic,theologic,sports,private,special,sep="|"),hs_list_temp_matched$school_harmonized)),
                                       100,hs_list_temp_matched$dist3)                                                           
    hs_list_temp_matched$dist3<-ifelse((grepl(paste(technical,theoretic,theologic,private,special,arts,sep="|"),hs_list_temp_matched$unitate_de_invatamant)& grepl(sports,hs_list_temp_matched$school_harmonized))|
                                         (grepl(sports,hs_list_temp_matched$unitate_de_invatamant)&grepl(paste(technical,theoretic,theologic,private,special,arts,sep="|"),hs_list_temp_matched$school_harmonized)),
                                       100,hs_list_temp_matched$dist3) 
    hs_list_temp_matched$dist3<-ifelse((grepl(paste(technical,theoretic,sports,private,special,arts,sep="|"),hs_list_temp_matched$unitate_de_invatamant)& grepl(theologic,hs_list_temp_matched$school_harmonized))|
                                         (grepl(theologic,hs_list_temp_matched$unitate_de_invatamant)&grepl(paste(technical,theoretic,sports,private,special,arts,sep="|"),hs_list_temp_matched$school_harmonized)),
                                       100,hs_list_temp_matched$dist3)  
    hs_list_temp_matched$dist3<-ifelse((grepl(paste(technical,theologic,sports,private,special,arts,sep="|"),hs_list_temp_matched$unitate_de_invatamant)& grepl(theoretic,hs_list_temp_matched$school_harmonized))|
                                         (grepl(theoretic,hs_list_temp_matched$unitate_de_invatamant)&grepl(paste(technical,theologic,sports,private,special,arts,sep="|"),hs_list_temp_matched$school_harmonized)),
                                       100,hs_list_temp_matched$dist3)  
    hs_list_temp_matched$dist3<-ifelse((grepl(paste(theoretic,theologic,sports,private,special,arts,sep="|"),hs_list_temp_matched$unitate_de_invatamant)& grepl(technical,hs_list_temp_matched$school_harmonized))|
                                         (grepl(technical,hs_list_temp_matched$unitate_de_invatamant)&grepl(paste(theoretic,theologic,sports,private,special,arts,sep="|"),hs_list_temp_matched$school_harmonized)),
                                       100,hs_list_temp_matched$dist3) 
    hs_list_temp_matched$dist3<-ifelse((grepl(paste(theoretic,theologic,sports,private,technical,arts,sep="|"),hs_list_temp_matched$unitate_de_invatamant)& grepl(special,hs_list_temp_matched$school_harmonized))|
                                         (grepl(special,hs_list_temp_matched$unitate_de_invatamant)&grepl(paste(theoretic,theologic,sports,private,technical,arts,sep="|"),hs_list_temp_matched$school_harmonized)),
                                       100,hs_list_temp_matched$dist3) 
    hs_list_temp_matched$dist3<-ifelse((grepl(paste(theoretic,theologic,sports,special,technical,arts,sep="|"),hs_list_temp_matched$unitate_de_invatamant)& grepl(private,hs_list_temp_matched$school_harmonized))|
                                         (grepl(private,hs_list_temp_matched$unitate_de_invatamant)&grepl(paste(theoretic,theologic,sports,special,technical,arts,sep="|"),hs_list_temp_matched$school_harmonized)),
                                       100,hs_list_temp_matched$dist3)
    #hs_list_temp_matched$dist3<-ifelse((grepl(paste(theoretic,theologic,technical,sports,private,special,"(LICEUL)","(COLEGIUL)",arts,sep="|"),hs_list_temp_matched$unitate_de_invatamant)& grepl(technical_pre_2012,hs_list_temp_matched$school_harmonized))|
    #                                    (grepl(technical_pre_2012,hs_list_temp_matched$unitate_de_invatamant)&grepl(paste(theoretic,technical,theologic,sports,private,special,arts,"(LICEUL)","(COLEGIUL)",sep="|"),hs_list_temp_matched$school_harmonized)),
    #                                  100,hs_list_temp_matched$dist3) 
    
    
    hs_list_temp_matched$dist4<-ifelse((grepl(paste(technical,theoretic,theologic,sports,private,special,sep="|"),hs_list_temp_matched$unitate_de_invatamant)& grepl(arts,hs_list_temp_matched$school_harmonized))|
                                         (grepl(arts,hs_list_temp_matched$unitate_de_invatamant)&grepl(paste(technical,theoretic,theologic,sports,private,special,sep="|"),hs_list_temp_matched$school_harmonized)),
                                       100,hs_list_temp_matched$dist4)                                                           
    hs_list_temp_matched$dist4<-ifelse((grepl(paste(technical,theoretic,theologic,private,special,arts,sep="|"),hs_list_temp_matched$unitate_de_invatamant)& grepl(sports,hs_list_temp_matched$school_harmonized))|
                                         (grepl(sports,hs_list_temp_matched$unitate_de_invatamant)&grepl(paste(technical,theoretic,theologic,private,special,arts,sep="|"),hs_list_temp_matched$school_harmonized)),
                                       100,hs_list_temp_matched$dist4) 
    hs_list_temp_matched$dist4<-ifelse((grepl(paste(technical,theoretic,sports,private,special,arts,sep="|"),hs_list_temp_matched$unitate_de_invatamant)& grepl(theologic,hs_list_temp_matched$school_harmonized))|
                                         (grepl(theologic,hs_list_temp_matched$unitate_de_invatamant)&grepl(paste(technical,theoretic,sports,private,special,arts,sep="|"),hs_list_temp_matched$school_harmonized)),
                                       100,hs_list_temp_matched$dist4)  
    hs_list_temp_matched$dist4<-ifelse((grepl(paste(technical,theologic,sports,private,special,arts,sep="|"),hs_list_temp_matched$unitate_de_invatamant)& grepl(theoretic,hs_list_temp_matched$school_harmonized))|
                                         (grepl(theoretic,hs_list_temp_matched$unitate_de_invatamant)&grepl(paste(technical,theologic,sports,private,special,arts,sep="|"),hs_list_temp_matched$school_harmonized)),
                                       100,hs_list_temp_matched$dist4)  
    hs_list_temp_matched$dist4<-ifelse((grepl(paste(theoretic,theologic,sports,private,special,arts,sep="|"),hs_list_temp_matched$unitate_de_invatamant)& grepl(technical,hs_list_temp_matched$school_harmonized))|
                                         (grepl(technical,hs_list_temp_matched$unitate_de_invatamant)&grepl(paste(theoretic,theologic,sports,private,special,arts,sep="|"),hs_list_temp_matched$school_harmonized)),
                                       100,hs_list_temp_matched$dist4) 
    hs_list_temp_matched$dist4<-ifelse((grepl(paste(theoretic,theologic,sports,private,technical,arts,sep="|"),hs_list_temp_matched$unitate_de_invatamant)& grepl(special,hs_list_temp_matched$school_harmonized))|
                                         (grepl(special,hs_list_temp_matched$unitate_de_invatamant)&grepl(paste(theoretic,theologic,sports,private,technical,arts,sep="|"),hs_list_temp_matched$school_harmonized)),
                                       100,hs_list_temp_matched$dist4) 
    hs_list_temp_matched$dist4<-ifelse((grepl(paste(theoretic,theologic,sports,special,technical,arts,sep="|"),hs_list_temp_matched$unitate_de_invatamant)& grepl(private,hs_list_temp_matched$school_harmonized))|
                                         (grepl(private,hs_list_temp_matched$unitate_de_invatamant)&grepl(paste(theoretic,theologic,sports,special,technical,arts,sep="|"),hs_list_temp_matched$school_harmonized)),
                                       100,hs_list_temp_matched$dist4)
    #hs_list_temp_matched$dist4<-ifelse((grepl(paste(theoretic,theologic,technical,sports,private,special,"(LICEUL)","(COLEGIUL)",arts,sep="|"),hs_list_temp_matched$unitate_de_invatamant)& grepl(technical_pre_2012,hs_list_temp_matched$school_harmonized))|
    #                                    (grepl(technical_pre_2012,hs_list_temp_matched$unitate_de_invatamant)&grepl(paste(theoretic,technical,theologic,sports,private,special,arts,"(LICEUL)","(COLEGIUL)",sep="|"),hs_list_temp_matched$school_harmonized)),
    #                                  100,hs_list_temp_matched$dist4) 
    
    hs_list_temp_matched$dist5<-ifelse((grepl(paste(technical,theoretic,theologic,sports,private,special,sep="|"),hs_list_temp_matched$unitate_de_invatamant)& grepl(arts,hs_list_temp_matched$liceu_repartizat_from_hs_list))|
                                         (grepl(arts,hs_list_temp_matched$unitate_de_invatamant)&grepl(paste(technical,theoretic,theologic,sports,private,special,sep="|"),hs_list_temp_matched$liceu_repartizat_from_hs_list)),
                                       100,hs_list_temp_matched$dist5)                                                           
    hs_list_temp_matched$dist5<-ifelse((grepl(paste(technical,theoretic,theologic,private,special,arts,sep="|"),hs_list_temp_matched$unitate_de_invatamant)& grepl(sports,hs_list_temp_matched$liceu_repartizat_from_hs_list))|
                                         (grepl(sports,hs_list_temp_matched$unitate_de_invatamant)&grepl(paste(technical,theoretic,theologic,private,special,arts,sep="|"),hs_list_temp_matched$liceu_repartizat_from_hs_list)),
                                       100,hs_list_temp_matched$dist5) 
    hs_list_temp_matched$dist5<-ifelse((grepl(paste(technical,theoretic,sports,private,special,arts,sep="|"),hs_list_temp_matched$unitate_de_invatamant)& grepl(theologic,hs_list_temp_matched$liceu_repartizat_from_hs_list))|
                                         (grepl(theologic,hs_list_temp_matched$unitate_de_invatamant)&grepl(paste(technical,theoretic,sports,private,special,arts,sep="|"),hs_list_temp_matched$liceu_repartizat_from_hs_list)),
                                       100,hs_list_temp_matched$dist5)  
    hs_list_temp_matched$dist5<-ifelse((grepl(paste(technical,theologic,sports,private,special,arts,sep="|"),hs_list_temp_matched$unitate_de_invatamant)& grepl(theoretic,hs_list_temp_matched$liceu_repartizat_from_hs_list))|
                                         (grepl(theoretic,hs_list_temp_matched$unitate_de_invatamant)&grepl(paste(technical,theologic,sports,private,special,arts,sep="|"),hs_list_temp_matched$liceu_repartizat_from_hs_list)),
                                       100,hs_list_temp_matched$dist5)  
    hs_list_temp_matched$dist5<-ifelse((grepl(paste(theoretic,theologic,sports,private,special,arts,sep="|"),hs_list_temp_matched$unitate_de_invatamant)& grepl(technical,hs_list_temp_matched$liceu_repartizat_from_hs_list))|
                                         (grepl(technical,hs_list_temp_matched$unitate_de_invatamant)&grepl(paste(theoretic,theologic,sports,private,special,arts,sep="|"),hs_list_temp_matched$liceu_repartizat_from_hs_list)),
                                       100,hs_list_temp_matched$dist5) 
    hs_list_temp_matched$dist5<-ifelse((grepl(paste(theoretic,theologic,sports,private,technical,arts,sep="|"),hs_list_temp_matched$unitate_de_invatamant)& grepl(special,hs_list_temp_matched$liceu_repartizat_from_hs_list))|
                                         (grepl(special,hs_list_temp_matched$unitate_de_invatamant)&grepl(paste(theoretic,theologic,sports,private,technical,arts,sep="|"),hs_list_temp_matched$liceu_repartizat_from_hs_list)),
                                       100,hs_list_temp_matched$dist5) 
    hs_list_temp_matched$dist5<-ifelse((grepl(paste(theoretic,theologic,sports,special,technical,arts,sep="|"),hs_list_temp_matched$unitate_de_invatamant)& grepl(private,hs_list_temp_matched$liceu_repartizat_from_hs_list))|
                                         (grepl(private,hs_list_temp_matched$unitate_de_invatamant)&grepl(paste(theoretic,theologic,sports,special,technical,arts,sep="|"),hs_list_temp_matched$liceu_repartizat_from_hs_list)),
                                       100,hs_list_temp_matched$dist5)
    #hs_list_temp_matched$dist5<-ifelse((grepl(paste(theoretic,theologic,technical,sports,private,special,"(LICEUL)","(COLEGIUL)",arts,sep="|"),hs_list_temp_matched$unitate_de_invatamant)& grepl(technical_pre_2012,hs_list_temp_matched$liceu_repartizat_from_hs_list))|
    #                                    (grepl(technical_pre_2012,hs_list_temp_matched$unitate_de_invatamant)&grepl(paste(theoretic,technical,theologic,sports,private,special,arts,"(LICEUL)","(COLEGIUL)",sep="|"),hs_list_temp_matched$liceu_repartizat_from_hs_list)),
    #                                  100,hs_list_temp_matched$dist5) 
    
    
    hs_list_temp_matched$dist6<-ifelse((grepl(paste(technical,theoretic,theologic,sports,private,special,sep="|"),hs_list_temp_matched$unitate_de_invatamant)& grepl(arts,hs_list_temp_matched$liceu_repartizat_from_hs_list))|
                                         (grepl(arts,hs_list_temp_matched$unitate_de_invatamant)&grepl(paste(technical,theoretic,theologic,sports,private,special,sep="|"),hs_list_temp_matched$liceu_repartizat_from_hs_list)),
                                       100,hs_list_temp_matched$dist6)                                                           
    hs_list_temp_matched$dist6<-ifelse((grepl(paste(technical,theoretic,theologic,private,special,arts,sep="|"),hs_list_temp_matched$unitate_de_invatamant)& grepl(sports,hs_list_temp_matched$liceu_repartizat_from_hs_list))|
                                         (grepl(sports,hs_list_temp_matched$unitate_de_invatamant)&grepl(paste(technical,theoretic,theologic,private,special,arts,sep="|"),hs_list_temp_matched$liceu_repartizat_from_hs_list)),
                                       100,hs_list_temp_matched$dist6) 
    hs_list_temp_matched$dist6<-ifelse((grepl(paste(technical,theoretic,sports,private,special,arts,sep="|"),hs_list_temp_matched$unitate_de_invatamant)& grepl(theologic,hs_list_temp_matched$liceu_repartizat_from_hs_list))|
                                         (grepl(theologic,hs_list_temp_matched$unitate_de_invatamant)&grepl(paste(technical,theoretic,sports,private,special,arts,sep="|"),hs_list_temp_matched$liceu_repartizat_from_hs_list)),
                                       100,hs_list_temp_matched$dist6)  
    hs_list_temp_matched$dist6<-ifelse((grepl(paste(technical,theologic,sports,private,special,arts,sep="|"),hs_list_temp_matched$unitate_de_invatamant)& grepl(theoretic,hs_list_temp_matched$liceu_repartizat_from_hs_list))|
                                         (grepl(theoretic,hs_list_temp_matched$unitate_de_invatamant)&grepl(paste(technical,theologic,sports,private,special,arts,sep="|"),hs_list_temp_matched$liceu_repartizat_from_hs_list)),
                                       100,hs_list_temp_matched$dist6)  
    hs_list_temp_matched$dist6<-ifelse((grepl(paste(theoretic,theologic,sports,private,special,arts,sep="|"),hs_list_temp_matched$unitate_de_invatamant)& grepl(technical,hs_list_temp_matched$liceu_repartizat_from_hs_list))|
                                         (grepl(technical,hs_list_temp_matched$unitate_de_invatamant)&grepl(paste(theoretic,theologic,sports,private,special,arts,sep="|"),hs_list_temp_matched$liceu_repartizat_from_hs_list)),
                                       100,hs_list_temp_matched$dist6) 
    hs_list_temp_matched$dist6<-ifelse((grepl(paste(theoretic,theologic,sports,private,technical,arts,sep="|"),hs_list_temp_matched$unitate_de_invatamant)& grepl(special,hs_list_temp_matched$liceu_repartizat_from_hs_list))|
                                         (grepl(special,hs_list_temp_matched$unitate_de_invatamant)&grepl(paste(theoretic,theologic,sports,private,technical,arts,sep="|"),hs_list_temp_matched$liceu_repartizat_from_hs_list)),
                                       100,hs_list_temp_matched$dist6) 
    hs_list_temp_matched$dist6<-ifelse((grepl(paste(theoretic,theologic,sports,special,technical,arts,sep="|"),hs_list_temp_matched$unitate_de_invatamant)& grepl(private,hs_list_temp_matched$liceu_repartizat_from_hs_list))|
                                         (grepl(private,hs_list_temp_matched$unitate_de_invatamant)&grepl(paste(theoretic,theologic,sports,special,technical,arts,sep="|"),hs_list_temp_matched$liceu_repartizat_from_hs_list)),
                                       100,hs_list_temp_matched$dist6)
    #hs_list_temp_matched$dist6<-ifelse((grepl(paste(theoretic,theologic,technical,sports,private,special,"(LICEUL)","(COLEGIUL)",arts,sep="|"),hs_list_temp_matched$unitate_de_invatamant)& grepl(technical_pre_2012,hs_list_temp_matched$liceu_repartizat_from_hs_list))|
    #                                    (grepl(technical_pre_2012,hs_list_temp_matched$unitate_de_invatamant)&grepl(paste(theoretic,technical,theologic,sports,private,special,arts,"(LICEUL)","(COLEGIUL)",sep="|"),hs_list_temp_matched$liceu_repartizat_from_hs_list)),
    #                                  100,hs_list_temp_matched$dist6) 
    
    
    hs_list_temp_matched$dist6<-ifelse(is.na(hs_list_temp_matched$dist6),100,hs_list_temp_matched$dist6)
    hs_list_temp_matched$dist5<-ifelse(is.na(hs_list_temp_matched$dist5),100,hs_list_temp_matched$dist5)
    
    
    
    
    

    #hs_list_temp_matched<-as.data.frame(append(hs_list_temp_matched, list(dist_min = NA), after = 1))
    hs_list_temp_matched<-hs_list_temp_matched %>% mutate(dist_min=NA) %>% select(dist_min,everything())
    hs_list_temp_matched$dist_min<-pmin(hs_list_temp_matched$dist,hs_list_temp_matched$dist2,hs_list_temp_matched$dist3,hs_list_temp_matched$dist4,hs_list_temp_matched$dist5,hs_list_temp_matched$dist6)
    
    hs_list_temp_matched_refined<-  hs_list_temp_matched %>% mutate(dist_max=dist+dist2+dist3+dist4+dist5+dist6) %>%
      group_by(judet_bac,town_hs_bac,unitate_de_invatamant) %>%
      arrange(dist_min,dist_max) %>% 
      mutate(rank_school1=row_number()) %>%
      group_by(judet_bac,town_hs_bac,school_harmonized_original) %>%
      arrange(dist_min,dist_max) %>% 
      mutate(rank_school2=row_number()) %>%
      arrange(judet_bac,town_hs_bac,unitate_de_invatamant,dist_min,dist_max)
    
    #test2<-hs_list_temp_matched_refined %>% filter(town_hs_bac=='LUPENI')
    # hs_list_temp_matched_refined<-hs_list_temp_matched_refined %>% filter(grepl('MRAZEK',unitate_de_invatamant))
    
    # Iteratively prune the candidate-match table (a fixed 10 passes).
    # rank_school1 = rank of a candidate among the matches of one current-year school;
    # rank_school2 = rank of a candidate among the matches of one harmonized (list) school.
    # A row with rank_school1==1 & rank_school2==1 is a "mutual best" pair. Each pass locks in
    # mutual-best pairs by dropping their competing candidates, then re-ranks what is left,
    # which can expose new mutual-best pairs for the next pass.
    for (j in c(1:10)){
      hs_list_temp_matched_refined<-hs_list_temp_matched_refined %>%
        # Per current-year school: if it has a mutual-best candidate, keep only its rank-1 row;
        # otherwise keep every candidate (rank_school1>0 is always TRUE for row_number() ranks).
        group_by(judet_bac,town_hs_bac,unitate_de_invatamant,liceu_repartizat_corresponding_to_unitate_de_invatamant) %>%
        filter(if(any(rank_school1==1 & rank_school2==1)) rank_school1==1 else rank_school1>0) %>%
        # arrange() sorts the whole table (it ignores grouping by default); row_number() below then
        # ranks rows within the current groups in that sorted order.
        # NOTE(review): dist_max is sorted descending here (-dist_max) but ascending in the initial
        # ranking just before this loop -- confirm which tie-break is intended.
        arrange(judet_bac,town_hs_bac,unitate_de_invatamant,dist_min,-dist_max) %>%
        mutate(rank_school1=row_number()) %>%
        # Per harmonized school: if it is already part of a mutual-best pair with a school from the
        # current year, delete all of its other candidate matches.
        group_by(judet_bac,town_hs_bac,school_harmonized_original) %>%
        filter(if(any(rank_school1==1 & rank_school2==1)) rank_school2==1 else rank_school2>0) %>%
        # NOTE(review): sorts by school_harmonized while the grouping key is
        # school_harmonized_original -- presumably equivalent for ranking within groups; verify.
        arrange(judet_bac,town_hs_bac,school_harmonized,dist_min,-dist_max) %>% 
        mutate(rank_school2=row_number()) 
    }
    # Keep only the top match for each school, both for new schools and old schools; we do not want
    # any double matches. First one row per current-year school, then one row per harmonized school,
    # each time taking the row with the smallest (dist_min, dist).
    hs_list_temp_matched_refined<-hs_list_temp_matched_refined %>%
      group_by(judet_bac,town_hs_bac,unitate_de_invatamant) %>%
      arrange(dist_min,dist) %>%
      slice(1) %>%
      group_by(judet_bac,town_hs_bac,school_harmonized_original) %>%
      arrange(dist_min,dist) %>%
      slice(1) %>%
      arrange(dist_min)

    
    
    
    #### Flag a match as accepted only when its distance is strictly below dist_threshold
    #### (presumably measured in characters); the other rows are treated as unmatched further down.
    # The threshold is computed per (judet_bac, town_hs_bac) group as max(4, 15 - sqrt(n)), where n is
    # the number of rows left for that town: towns with few schools tolerate a larger name distance,
    # towns with many schools are held to a stricter cutoff, never below 4.
    hs_list_temp_matched_refined<- hs_list_temp_matched_refined %>% group_by(judet_bac,town_hs_bac) %>% mutate(dist_threshold=max(4,15-(n())^(1/2)),match=dist_min<dist_threshold) %>%
      select(match,dist_threshold,everything())
    
    # hs_list_temp_matched_refined[is.na(hs_list_temp_matched_refined$unitate_de_invatamant),]
    hs_list_temp_matched_refined[which(hs_list_temp_matched_refined$dist_min>=hs_list_temp_matched_refined$dist_threshold),]$school_harmonized_original<-hs_list_temp_matched_refined[which(hs_list_temp_matched_refined$dist_min>=hs_list_temp_matched_refined$dist_threshold),]$unitate_de_invatamant_original
    hs_list_temp_matched_refined$dist<-hs_list_temp_matched_refined$dist_min
    hs_list_temp_matched_refined<-hs_list_temp_matched_refined[,! colnames(hs_list_temp_matched_refined) %in% c("dist_threshold","dist_min","dist2","dist3","dist4","dist5","dist6","name_1","name_2","name_3","name_1_clean","name_2_clean","name_3_clean","match","dist_max","rank_school1","rank_school2","liceu_repartizat_from_hs_list") ]
    
    #restore original names of HS's
    hs_list_temp_matched_refined$unitate_de_invatamant<-hs_list_temp_matched_refined$unitate_de_invatamant_original
    hs_list_temp_matched_refined$school_harmonized<-hs_list_temp_matched_refined$school_harmonized_original
    
    
    #get the hs's from the original list who are not in match_refined(i.e. we could not find a match for them)
    
    hs_list_temp_unmatched<-anti_join(hs_list_temp[,c("judet_bac","town_hs_bac","unitate_de_invatamant","yr","liceu_repartizat_corresponding_to_unitate_de_invatamant")],
                                      hs_list_temp_matched_refined[,c("judet_bac","town_hs_bac","unitate_de_invatamant","liceu_repartizat_corresponding_to_unitate_de_invatamant")],
                                      by=c("judet_bac","town_hs_bac","unitate_de_invatamant","liceu_repartizat_corresponding_to_unitate_de_invatamant"))
    
    #get rid of the "original" column copies
    hs_list_temp_matched_refined<-hs_list_temp_matched_refined[,! colnames(hs_list_temp_matched_refined) %in% c("unitate_de_invatamant_original","school_harmonized_original") ]
    
    #some exceptions
    hs_list_temp_matched_refined<-hs_list_temp_matched_refined %>% 
      mutate(school_harmonized=ifelse(grepl("TECLU",unitate_de_invatamant) & judet_bac=="BUCURESTI",'LICEUL TEHNOLOGIC "NICOLAE TECLU"',school_harmonized),
             dist=ifelse(grepl("TECLU",unitate_de_invatamant) & judet_bac=="BUCURESTI",0,dist))
    hs_list_temp_matched_refined<-hs_list_temp_matched_refined %>% 
      mutate(school_harmonized=ifelse(grepl("GRUPUL SCOLAR DE COOPERATIE ALBA IULIA",unitate_de_invatamant),'GRUPUL SCOLAR DE COOPERATIE ALBA IULIA',school_harmonized),
             dist=ifelse(grepl("GRUPUL SCOLAR DE COOPERATIE ALBA IULIA",unitate_de_invatamant),0,dist))
    # hs_list_temp_matched_refined<-hs_list_temp_matched_refined %>% 
    #   mutate(school_harmonized=ifelse(grepl("GRUPUL SCOLAR DE COOPERATIE ALBA IULIA",unitate_de_invatamant),'GRUP SCOLAR INDUSTRIAL "AVRAM IANCU" ZLATNA',school_harmonized),
    #          dist=ifelse(grepl('GRUP SCOLAR INDUSTRIAL "AVRAM IANCU" ZLATNA',unitate_de_invatamant),0,dist))

    hs_list_temp_matched_refined<-hs_list_temp_matched_refined %>% 
      mutate(school_harmonized=ifelse(grepl("LICEUL TEHNOLOGIC SANTANA",unitate_de_invatamant),'LICEUL TEHNOLOGIC "STEFAN HELL" SANTANA',school_harmonized),
             dist=ifelse(grepl('LICEUL TEHNOLOGIC SANTANA',unitate_de_invatamant),0,dist))
    hs_list_temp_matched_refined<-hs_list_temp_matched_refined %>% 
      mutate(school_harmonized=ifelse(grepl("LICEUL CU PROGRAM SPORTIV PITESTI",unitate_de_invatamant),'LICEUL CU PROGRAM SPORTIV "VIITORUL" PITESTI',school_harmonized),
             dist=ifelse(grepl('LICEUL CU PROGRAM SPORTIV PITESTI',unitate_de_invatamant),0,dist))
    # hs_list_temp_matched_refined<-hs_list_temp_matched_refined %>% 
    #   mutate(school_harmonized=ifelse(grepl('GRUP SCOLAR "ION CANTACUZINO" PITESTI',unitate_de_invatamant),'GRUP SCOLAR "ION CANTACUZINO" PITESTI',school_harmonized),
    #          dist=ifelse(grepl('GRUP SCOLAR "ION CANTACUZINO" PITESTI',unitate_de_invatamant),-2,dist))
    # hs_list_temp_matched_refined<-hs_list_temp_matched_refined %>% 
    #   mutate(school_harmonized=ifelse(grepl('GRUPUL SCOLAR FORESTIER ARAD',unitate_de_invatamant),'GRUPUL SCOLAR FORESTIER ARAD',school_harmonized),
    #          dist=ifelse(grepl('GRUPUL SCOLAR FORESTIER ARAD',unitate_de_invatamant),-2,dist))
    hs_list_temp_matched_refined<-hs_list_temp_matched_refined %>% 
      mutate(school_harmonized=ifelse(grepl('LICEUL TEHNOLOGIC "LUDOVIC MRAZEK", MUNICIPIUL PLOIESTI',unitate_de_invatamant),'LICEUL TEHNOLOGIC "LUDOVIC MRAZEK", MUNICIPIUL PLOIESTI',school_harmonized),
             dist=ifelse(grepl('LICEUL TEHNOLOGIC "LUDOVIC MRAZEK", MUNICIPIUL PLOIESTI',unitate_de_invatamant),-2,dist))
    hs_list_temp_matched_refined<-hs_list_temp_matched_refined %>% 
      mutate(school_harmonized=ifelse(grepl('"SABIN DRAGOI" DEVA',unitate_de_invatamant),'COLEGIUL NATIONAL "SABIN DRAGOI" DEVA',school_harmonized),
             dist=ifelse(grepl('"SABIN DRAGOI" DEVA',unitate_de_invatamant),-2,dist))
    hs_list_temp_matched_refined<-hs_list_temp_matched_refined %>% 
      mutate(school_harmonized=ifelse(grepl('LICEUL WALDORF CLUJ-NAPOCA',unitate_de_invatamant),'LICEUL WALDORF CLUJ-NAPOCA',school_harmonized),
             dist=ifelse(grepl('LICEUL WALDORF CLUJ-NAPOCA',unitate_de_invatamant),-2,dist))
    hs_list_temp_matched_refined<-hs_list_temp_matched_refined %>% 
      mutate(school_harmonized=ifelse(grepl('LICEUL DE ARTE ALBA IULIA',unitate_de_invatamant),'LICEUL DE ARTE "REGINA MARIA" ALBA IULIA',school_harmonized),
             dist=ifelse(grepl('LICEUL DE ARTE ALBA IULIA',unitate_de_invatamant),-2,dist))
    hs_list_temp_matched_refined<-hs_list_temp_matched_refined %>% 
      mutate(school_harmonized=ifelse(grepl('LICEUL TEORETIC "GHEORGHE MAGHERU" CETATE',unitate_de_invatamant),'LICEUL TEORETIC "GHEORGHE MAGHERU" CETATE',school_harmonized),
             dist=ifelse(grepl('LICEUL TEORETIC "GHEORGHE MAGHERU" CETATE',unitate_de_invatamant),-2,dist))
    hs_list_temp_matched_refined<-hs_list_temp_matched_refined %>% 
      mutate(school_harmonized=ifelse(grepl('LICEUL "PETRU RARES" BISTRITA',unitate_de_invatamant),'LICEUL "PETRU RARES" BISTRITA',school_harmonized),
             dist=ifelse(grepl('LICEUL "PETRU RARES" BISTRITA',unitate_de_invatamant),-2,dist))
    # hs_list_temp_matched_refined<-hs_list_temp_matched_refined %>% 
    #   mutate(school_harmonized=ifelse(grepl('GRUPUL SCOLAR "SAMUS" CLUJ-NAPOCA',unitate_de_invatamant),'GRUPUL SCOLAR "SAMUS" CLUJ-NAPOCA',school_harmonized),
    #          dist=ifelse(grepl('GRUPUL SCOLAR "SAMUS" CLUJ-NAPOCA',unitate_de_invatamant),-2,dist))
    hs_list_temp_matched_refined<-hs_list_temp_matched_refined %>% 
      mutate(school_harmonized=ifelse(grepl('LICEUL TEORETIC SANITAR BAIA MARE',unitate_de_invatamant),'LICEUL TEORETIC SANITAR BAIA MARE',school_harmonized),
             dist=ifelse(grepl('LICEUL TEORETIC SANITAR BAIA MARE',unitate_de_invatamant),-2,dist))
    hs_list_temp_matched_refined<-hs_list_temp_matched_refined %>% 
      mutate(school_harmonized=ifelse(grepl('LICEUL TEHNOLOGIC "NICOLINA", IASI',unitate_de_invatamant),'LICEUL TEHNOLOGIC "NICOLINA", IASI',school_harmonized),
             dist=ifelse(grepl('LICEUL TEHNOLOGIC "NICOLINA", IASI',unitate_de_invatamant),-2,dist))
    hs_list_temp_matched_refined<-hs_list_temp_matched_refined %>% 
      mutate(school_harmonized=ifelse(grepl('LICEUL CONCORD BIRLAD',unitate_de_invatamant),'LICEUL CONCORD BIRLAD',school_harmonized),
             dist=ifelse(grepl('LICEUL CONCORD BIRLAD',unitate_de_invatamant),-2,dist))
    # hs_list_temp_matched_refined<-hs_list_temp_matched_refined %>% 
    #   mutate(school_harmonized=ifelse(grepl('GRUP SCOLAR INDUSTRIAL "TEHNOFRIG" CLUJ-NAPOCA',unitate_de_invatamant),'GRUP SCOLAR INDUSTRIAL "TEHNOFRIG" CLUJ-NAPOCA',school_harmonized),
    #          dist=ifelse(grepl('GRUP SCOLAR INDUSTRIAL "TEHNOFRIG" CLUJ-NAPOCA',unitate_de_invatamant),-2,dist))
    # hs_list_temp_matched_refined<-hs_list_temp_matched_refined %>% 
    #   mutate(school_harmonized=ifelse(grepl('GRUP SCOLAR MATERIAL RULANT "UNIREA" CLUJ-NAPOCA',unitate_de_invatamant),'GRUP SCOLAR MATERIAL RULANT "UNIREA" CLUJ-NAPOCA',school_harmonized),
    #          dist=ifelse(grepl('GRUP SCOLAR MATERIAL RULANT "UNIREA" CLUJ-NAPOCA',unitate_de_invatamant),-2,dist))
    # hs_list_temp_matched_refined<-hs_list_temp_matched_refined %>% 
    #   mutate(school_harmonized=ifelse(grepl('GRUPUL SCOLAR AGROMONTAN "TESILA" VALEA DOFTANEI',unitate_de_invatamant),'LICEUL TEHNOLOGIC "CAROL I", COMUNA VALEA DOFTANEI',school_harmonized),
    #          dist=ifelse(grepl('GRUPUL SCOLAR AGROMONTAN "TESILA" VALEA DOFTANEI',unitate_de_invatamant),-2,dist))
    # Manual exception: map "GRUP SCOLAR SILVIC TIMISOARA" to its harmonized name and mark the row
    # with dist = -2, like the other hand-made exceptions in this list.
    # The dist pattern previously ended in stray whitespace inside the quotes, so it never matched
    # the rows whose school_harmonized was overridden; both conditions now use the same pattern.
    hs_list_temp_matched_refined<-hs_list_temp_matched_refined %>% 
      mutate(school_harmonized=ifelse(grepl('GRUP SCOLAR SILVIC TIMISOARA',unitate_de_invatamant),'COLEGIUL DE SILVICULTURA SI AGRICULTURA "CASA VERDE" MUN. TIMISOARA',school_harmonized),
             dist=ifelse(grepl('GRUP SCOLAR SILVIC TIMISOARA',unitate_de_invatamant),-2,dist))
    # hs_list_temp_matched_refined<-hs_list_temp_matched_refined %>% 
    #   mutate(school_harmonized=ifelse(grepl('GRUPUL SCOLAR INDUSTRIAL DE PETROL "TELEAJEN" PLOIESTI',unitate_de_invatamant),'LICEUL TEHNOLOGIC "LUDOVIC MRAZEK", MUNICIPIUL PLOIESTI',school_harmonized),
    #          dist=ifelse(grepl('GRUPUL SCOLAR INDUSTRIAL DE PETROL "TELEAJEN" PLOIESTI',unitate_de_invatamant),-2,dist))
    hs_list_temp_matched_refined<-hs_list_temp_matched_refined %>% 
      mutate(school_harmonized=ifelse(grepl('LICEUL DE INFORMATICA PETROSANI',unitate_de_invatamant),'COLEGIUL NATIONAL DE INFORMATICA "CARMEN SYLVA" PETROSANI',school_harmonized),
             dist=ifelse(grepl('LICEUL DE INFORMATICA PETROSANI',unitate_de_invatamant),-2,dist))
    hs_list_temp_matched_refined<-hs_list_temp_matched_refined %>% 
      mutate(school_harmonized=ifelse(grepl('COLEGIUL TEHNIC "ASTRA" BRASOV',unitate_de_invatamant),'COLEGIUL TEHNIC "ASTRA" BRASOV',school_harmonized),
             dist=ifelse(grepl('COLEGIUL TEHNIC "ASTRA" BRASOV',unitate_de_invatamant),-2,dist)) #IOSIF SILIMON AND ASTRA MRGED TO CREATE TRANSILVANIA COLLEGE
    hs_list_temp_matched_refined<-hs_list_temp_matched_refined %>% 
      mutate(school_harmonized=ifelse(grepl('COLEGIUL "ION MINCU" DEVA',unitate_de_invatamant),'COLEGIUL "ION MINCU" DEVA',school_harmonized),
             dist=ifelse(grepl('COLEGIUL "ION MINCU" DEVA',unitate_de_invatamant),-2,dist)) 
    hs_list_temp_matched_refined<-hs_list_temp_matched_refined %>% 
      mutate(school_harmonized=ifelse(grepl('COLEGIUL FORESTIER RAMNICU VALCEA',unitate_de_invatamant),'COLEGIUL FORESTIER RAMNICU VALCEA',school_harmonized),
             dist=ifelse(grepl('COLEGIUL FORESTIER RAMNICU VALCEA',unitate_de_invatamant),-2,dist))
    # hs_list_temp_matched_refined<-hs_list_temp_matched_refined %>% 
    #   mutate(school_harmonized=ifelse(grepl('GRUP SCOLAR CONSTRUCTII DE MASINI "DACIA" PITESTI',unitate_de_invatamant),'GRUP SCOLAR CONSTRUCTII DE MASINI "DACIA" PITESTI',school_harmonized),
    #          dist=ifelse(grepl('GRUP SCOLAR CONSTRUCTII DE MASINI "DACIA" PITESTI',unitate_de_invatamant),-2,dist))
    hs_list_temp_matched_refined<-hs_list_temp_matched_refined %>% 
      mutate(school_harmonized=ifelse(grepl('LICEUL CONCORD ARAD',unitate_de_invatamant),'LICEUL CONCORD ARAD',school_harmonized),
             dist=ifelse(grepl('LICEUL CONCORD ARAD',unitate_de_invatamant),-2,dist))
    hs_list_temp_matched_refined<-hs_list_temp_matched_refined %>% 
      mutate(school_harmonized=ifelse(grepl('LICEUL TEORETIC "MUNTELE DE FOC" CAMPENITA',unitate_de_invatamant),'LICEUL TEORETIC "DR. LIND" CAMPENITA',school_harmonized),
             dist=ifelse(grepl('LICEUL TEORETIC "MUNTELE DE FOC" CAMPENITA',unitate_de_invatamant),-2,dist))
    hs_list_temp_matched_refined<-hs_list_temp_matched_refined %>% 
      mutate(school_harmonized=ifelse(grepl('LICEUL "DUMITRU MURGU" IASI',unitate_de_invatamant),'LICEUL "DUMITRU MURGU" IASI',school_harmonized),
             dist=ifelse(grepl('LICEUL "DUMITRU MURGU" IASI',unitate_de_invatamant),-2,dist))
    hs_list_temp_matched_refined<-hs_list_temp_matched_refined %>% 
      mutate(school_harmonized=ifelse(grepl('GRUP SCOLAR "GRIGORE MOISIL" BISTRITA',unitate_de_invatamant),'GRUP SCOLAR "GRIGORE MOISIL" BISTRITA',school_harmonized),
             dist=ifelse(grepl('GRUP SCOLAR "GRIGORE MOISIL" BISTRITA',unitate_de_invatamant),-2,dist))
    hs_list_temp_matched_refined<-hs_list_temp_matched_refined %>% 
      mutate(school_harmonized=ifelse(grepl('COLEGIUL TEHNIC TEXTIL SIBIU',unitate_de_invatamant),'COLEGIUL TEHNIC TEXTIL SIBIU',school_harmonized),
             dist=ifelse(grepl('COLEGIUL TEHNIC TEXTIL SIBIU',unitate_de_invatamant),-2,dist))
    # hs_list_temp_matched_refined<-hs_list_temp_matched_refined %>% 
    #   mutate(school_harmonized=ifelse(grepl('GRUPUL SCOLAR INDUSTRIAL "LETEA" BACAU',unitate_de_invatamant),'GRUPUL SCOLAR INDUSTRIAL "LETEA" BACAU',school_harmonized),
    #          dist=ifelse(grepl('GRUPUL SCOLAR INDUSTRIAL "LETEA" BACAU',unitate_de_invatamant),-2,dist))
    # hs_list_temp_matched_refined<-hs_list_temp_matched_refined %>% 
    #   mutate(school_harmonized=ifelse(grepl('GRUP SCOLAR FORESTIER TG-JIU',unitate_de_invatamant),'GRUP SCOLAR FORESTIER TG-JIU',school_harmonized),
    #          dist=ifelse(grepl('GRUP SCOLAR FORESTIER TG-JIU',unitate_de_invatamant),-2,dist))
    hs_list_temp_matched_refined<-hs_list_temp_matched_refined %>% 
      mutate(school_harmonized=ifelse(grepl('COLEGIUL TEHNIC "ION MINCU" CRAIOVA',unitate_de_invatamant),'COLEGIUL TEHNIC "ION MINCU" CRAIOVA',school_harmonized),
             dist=ifelse(grepl('COLEGIUL TEHNIC "ION MINCU" CRAIOVA',unitate_de_invatamant),-2,dist))
    # hs_list_temp_matched_refined<-hs_list_temp_matched_refined %>% 
    #   mutate(school_harmonized=ifelse(grepl('GRUPUL SCOLAR INDUSTRIAL "LETEA" BACAU',unitate_de_invatamant),'GRUPUL SCOLAR INDUSTRIAL "LETEA" BACAU',school_harmonized),
    #          dist=ifelse(grepl('GRUPUL SCOLAR INDUSTRIAL "LETEA" BACAU',unitate_de_invatamant),-2,dist))
    hs_list_temp_matched_refined<-hs_list_temp_matched_refined %>% 
      mutate(school_harmonized=ifelse(grepl('COLEGIUL TEHNIC FEROVIAR BRASOV',unitate_de_invatamant),'COLEGIUL TEHNIC FEROVIAR BRASOV',school_harmonized),
             dist=ifelse(grepl('COLEGIUL TEHNIC FEROVIAR BRASOV',unitate_de_invatamant),-2,dist))
    hs_list_temp_matched_refined<-hs_list_temp_matched_refined %>% 
      mutate(school_harmonized=ifelse(grepl('LICEUL WALDORF SIBIU',unitate_de_invatamant),'LICEUL WALDORF SIBIU',school_harmonized),
             dist=ifelse(grepl('LICEUL WALDORF SIBIU',unitate_de_invatamant),-2,dist))
    
    
    hs_list_temp_matched_refined<-hs_list_temp_matched_refined %>% arrange(judet_bac,town_hs_bac)

    
    
    
    
    
    
    
    
    
    
    # Ad-hoc inspection queries on hs_list (high-distance matches, specific schools/towns).
    # `test` is overwritten by each following line, and neither `test` nor `test2` is read in the
    # remainder of this function as far as visible here.
    # NOTE(review): these look like debugging leftovers that still run for every year; the first
    # one needs hs_list to already have `dist` and `school_harmonized` columns -- confirm they can
    # be removed or commented out.
    test2<-hs_list %>% filter(dist>3,school_harmonized!=unitate_de_invatamant)
    test<-hs_list %>% filter(grepl("POPESTI",unitate_de_invatamant))
    test<-hs_list %>% filter(grepl("SAT",unitate_de_invatamant) & grepl("COMUNA",unitate_de_invatamant))
    test<-hs_list %>% filter(grepl("TEXTIL",unitate_de_invatamant)  & grepl("SIBIU",town_hs_bac))
    test<-hs_list %>% filter(grepl("CETATE",town_hs_bac))
    
    
    
    if (dim(hs_list_temp_unmatched)[1]>0){
      hs_list_temp_unmatched$school_harmonized<-hs_list_temp_unmatched$unitate_de_invatamant
      #hs_list_temp_unmatched$MATCH_TEST<-hs_list_temp_unmatched$school_harmonized
      hs_list_temp_unmatched$dist<--2
      hs_list_temp<-rbind(as.data.frame(hs_list_temp_matched_refined),as.data.frame(hs_list_temp_unmatched))
    } else {
      hs_list_temp<- as.data.frame(hs_list_temp_matched_refined)
    }
    hs_list_temp<-hs_list_temp %>% arrange(dist)
    
    hs_list<-rbind(hs_list,hs_list_temp)
    hs_list_deduplicated<-hs_list %>% group_by(judet_bac,town_hs_bac,school_harmonized) %>% slice(which.min(dist)) 
    #add towns that are missing in previous yr
    towns<-unique(hs_list[,c("judet_bac","town_hs_bac")])
    
    print("hs name harmonized")
  
  }

  #saveRDS(hs_list, file = "hs_list.rds")
  
  
  
  #add HS_ID and school_harmonized to each year
  # For every year: read that year's intermediate file, attach school_harmonized from hs_list,
  # and save the result as "data_merged_complete_town_harmonized_<year>" in wd_data_intermediate.
  for (i in c(years)){
    print(i)
    # Both the input and the output file live in wd_data_intermediate, so the working directory
    # is set once per iteration.
    setwd(wd_data_intermediate)
    graduation_file_final<-paste0("data_merged_complete_town_",i)
    data_bac_raw<-readRDS(graduation_file_final)
    print('Number of rows before and after')
    n_rows_before<-nrow(data_bac_raw)
    print(n_rows_before)
    
    ###Add HS's to original data
    # Left join (all.x=TRUE) on county, town, school name, assigned high school and year
    # (`an` in the data, `yr` in hs_list); rows without a match get NA in school_harmonized.
    data_bac_raw<-base::merge(data_bac_raw,hs_list[c("judet_bac","town_hs_bac","unitate_de_invatamant","school_harmonized","liceu_repartizat_corresponding_to_unitate_de_invatamant","yr")],by.x=c("judet_bac","town_hs_bac","unitate_de_invatamant","liceu_repartizat_corresponding_to_unitate_de_invatamant","an"),by.y=c("judet_bac","town_hs_bac","unitate_de_invatamant","liceu_repartizat_corresponding_to_unitate_de_invatamant","yr"),all.x=TRUE)
    #data_bac_raw<-base::merge(data_bac_raw,hs_list[c("judet_bac","town_hs_bac","unitate_de_invatamant","school_harmonized","yr")],by.x=c("judet_bac","town_hs_bac","unitate_de_invatamant","an"),by.y=c("judet_bac","town_hs_bac","unitate_de_invatamant","yr"),all.x=T)
    n_rows_after<-nrow(data_bac_raw)
    print(n_rows_after)
    # A left join can only change the row count when hs_list holds duplicate keys, which would
    # duplicate observations; surface that instead of relying on reading the two printed numbers.
    if (n_rows_after!=n_rows_before){
      warning(paste0("Merge with hs_list changed the number of rows for year ",i,": ",n_rows_before," -> ",n_rows_after),call.=FALSE)
    }
    
    graduation_file_final<-paste0("data_merged_complete_town_harmonized_",i)
    saveRDS(data_bac_raw, file = graduation_file_final)
    message(paste('Observations after:',nrow(data_bac_raw)))
    print("harmonized name added to data")
  }
  
  
  #Some checks to see if results make sense
  #1. Look at all matches with high distance to see if they make sense
  #test<-hs_list_temp_matched[hs_list_temp_matched$dist>5,]
  #test<-test[order(test$dist),]
  
  #hs_list<-readRDS(paste("hs_list",".rds",sep=""))
  #Find when certain hs's were founded
  #n_hs_per_town_year<-hs_list %>% 
  #  group_by(judet_bac,town_hs_bac,school_harmonized,yr) %>% 
  #  summarize(n=n()) %>%
  #  spread(yr,n) 
  
  #hs_list<-hs_list %>% arrange(judet_bac,town_hs_bac,school_harmonized,yr)
  #test<-hs_list %>% arrange(dist)
  #test2<-hs_list %>% arrange(dist) %>% filter(town_hs_bac=="LUPENI") %>% arrange(yr)
}
